import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Seed both random number generators used by this script (the stdlib
# `random` module and NumPy's global generator) so runs are reproducible.
random.seed(42)
np.random.seed(42)

# Generation parameters.
num_users = 100  # number of user records to generate
start_date = datetime(year=2020, month=1, day=1)  # earliest registration date
end_date = datetime(year=2024, month=8, day=26)  # latest registration date
# Pool from which each user's favorite tags are sampled.
tag_pool = [
    '搞笑', '科技', '美食', '旅行', '美妆',
    '健身', '音乐', '萌宠', '游戏', '知识',
]
output_csv = 'users.csv'  # path of the CSV file written at the end

# Data generation
def random_date(start, end):
    """Return ``start`` moved forward by a random whole number of days.

    The day offset is drawn with ``random.randint`` from 0 up to and
    including ``(end - start).days``, so ``start`` itself can be returned.
    Only whole days are added; the time-of-day component of ``start`` is
    carried through unchanged.

    Raises:
        ValueError: propagated from ``random.randint`` when ``end`` lies
            at least one full day before ``start`` (empty range).
    """
    span_days = (end - start).days
    offset_days = random.randint(0, span_days)
    return start + timedelta(days=offset_days)

# Build one record (dict) per user. The random draws are written inline in
# the dict literal; Python evaluates the values top to bottom, so the order
# in which each generator is consumed stays fixed from run to run.
data = []
for user_index in range(1, num_users + 1):
    record = {
        # Zero-padded sequential id: U00001, U00002, ...
        'user_id': f'U{user_index:05d}',
        # Registration date formatted as YYYY-MM-DD.
        'register_time': random_date(start_date, end_date).strftime('%Y-%m-%d'),
        'gender': random.choice(['男', '女', '保密']),
        # Uniform draw over the range 5..180, rounded to two decimals.
        'avg_watch_minutes': round(np.random.uniform(5, 180), 2),
        # np.random.randint excludes the upper bound, so this is 10..9999.
        'total_watch_count': np.random.randint(10, 10000),
        # Between 1 and 5 distinct tags sampled from the pool.
        # NOTE(review): the value is a Python list; when the frame is written
        # to CSV the cell will hold its string form - confirm that is what
        # downstream consumers expect.
        'favorite_tags': random.sample(tag_pool, np.random.randint(1, 6)),
    }
    data.append(record)

# Assemble the generated records into a DataFrame (one row per record).
df = pd.DataFrame(data)

# Print the first rows as a quick preview.
print(df.head(n=5))

# Save as CSV (optional). The row index is omitted; the 'utf-8-sig' codec
# writes a UTF-8 byte-order mark at the start of the file.
df.to_csv(path_or_buf=output_csv, index=False, encoding='utf-8-sig')